################
#PSRM: Explaining Support for Redistribution: Social Insurance Systems and Fairness
#
#Observational Analysis ESS
#Part IX: Missing Data
#
#Verena Fetscher
#July 2022
####################

rm(list=ls())

##########################
#Load data
##########################

load("DataFile_09_spending_tax.Rda")


##########################
#Missing Data
##########################
data_sub %>%
  group_by(cntry,year) %>%
  summarise(income=mean(income.dist.th,na.rm=T))%>%
  filter(is.na(income)) %>%
  print(n=10)


#Remove data with missing income year
data_sub<-data_sub[!((data_sub$country=="France"&
                        data_sub$year==2002)|
                       (data_sub$country=="Ireland"&
                          data_sub$year==2002)|
                       (data_sub$country=="Ireland"&
                          data_sub$year==2006)|
                       (data_sub$country=="Portugal"&
                          data_sub$year==2010)),]

data_sub %>%
  group_by(cntry,year) %>%
  summarise(income=mean(income.dist.th,na.rm=T))%>%
  print(n=10)


#Check for missing data in all columns
sapply(data_sub, function(x) sum(is.na(x)))


data_sub %>%
  group_by(cntry,year) %>%
  summarise(risk=mean(our,na.rm=T))%>%
  print(n=100)


data_sub %>%
  group_by(cntry,year) %>%
  summarise(risk=mean(our,na.rm=T))%>%
  print(n=100)


##########################
#Handling Missing Data
##########################

#1) Listwise deletion 
data_sub %>%
  drop_na(income.PPP,redistribution,
          gender,agea,eduyrs,uempl,our) -> data_list

data_list %>%
  group_by(cntry,year) %>%
  summarise(income=mean(income.dist.th,na.rm=T),
            redistribution=mean(redistribution,na.rm=T),
            risk=mean(our,na.rm=T),
            benefits=mean(benefitsAvg,na.rm=T),
            gender=mean(gender,na.rm=T),
            age=mean(agea,na.rm=T),
            education=mean(eduyrs,na.rm=T),
            unemployment=mean(uempl,na.rm=T),
            gini=mean(gini.disp,na.rm=T),
            immigration=mean(immigration,na.rm=T),
            social_spending=mean(social_spending,na.rm=T),
            toptax=mean(toptax,na.rm=T),
            red_taxes=mean(red_taxes,na.rm=T),
            red_transfers=mean(red_transfers,na.rm=T))

data_list%>%
  dplyr::group_by(cntry,year,isco2d) %>%
  dplyr::summarise(n = n(),mean = mean(our,na.rm=T))%>%
  print(n=100)

data_list %>%
  mutate(across(c(isco2d,hincfel,health,lrscale,stflife,rtrd), 
                ~as.numeric(.))) -> data_list


##########################



#2) Multiple Imputation
#Subsetting data with additional variables to improve income prediction in imputation
#hhmmb: Number of people living regularly as member of household (dependent children)
#hincfel: Feeling about household's income nowadays (satisfaction with own income)
#health: Subjective general health (subjective health)
#lrscale: Placement on left right scale (political ideology)
#stflife: How satisfied with life as a whole (life satisfaction)
#rtrd: retired


#Subsetting data
#Remove modified income data (generate anew after imputation)
data_sub%>%
  select(country,year,income.PPP,income.dist.th,
         redistribution,our,isco2d,gender,agea,
         eduyrs,uempl,single,married67,
         benefitsAvg,benLev,benefitsAvgYear,
         hhmmb,hincfel,health,lrscale,
         stflife,rtrd,immigration,gini.disp,
         social_spending,toptax,red_taxes,red_transfers) -> data_sub




x<-sub('.*sub.', '', names(data_sub))
names(data_sub)<-x
rm(x)


#Change variable class from factor to numeric
data_sub %>%
  mutate(across(c(isco2d,hincfel,health,lrscale,stflife,rtrd), 
                ~as.numeric(.))) -> data_sub

#Remove empty factor values for country
data_sub$country <- factor(data_sub$country)
table(data_sub$country)

names(data_sub) 
str(data_sub)
##########################


#Amelia for multiple imputations
#https://gking.harvard.edu/amelia

#Map missing data
#missmap(data_sub)

#Table with missing data
pMiss <- function(x){sum(is.na(x))/length(x)*100}
df01<-apply(data_sub,2,pMiss)
df01

#Remove missing age and gender
data_sub %>%
  drop_na(gender,agea) -> data_sub


#Generate 5 data sets with mutliply imputed data
table(data_sub$isco2d)

#set occupational unemployment rate between 0 and 1
bds <- matrix(c(6, 0, 1), nrow = 1, ncol = 3)

a.out<-amelia(data_sub,m=5,ts="year",cs="country",
              idvars=c("income.dist.th","gini.disp",
                       "immigration","social_spending","toptax",
                       "red_taxes","red_transfers"), 
              #idvars kept in imputed data fame in order to get correct dimensions and recode according to imputed income data afterwards
              ords=c("uempl","redistribution"),
              bounds=bds)

##########################

summary(a.out$imputations$imp1$our)


##########################
#Data inspection
##########################
table(is.na(a.out$imputations$imp1))

summary(a.out)
summary(a.out$imputations$imp1$redistribution)
summary(a.out$imputations$imp1$isco2d)

compare.density(a.out, var = "our")
compare.density(a.out, var = "income.PPP")

plot(a.out,which.vars=3)


#missmap(a.out)
#overimpute(a.out, var = "income.PPP")
#disperse(a.out,m=5)



##########################
#Recode data
##########################

#Income in thousands
a.out$imputations$imp1$income.PPP.th<-a.out$imputations$imp1$income.PPP/1000
a.out$imputations$imp2$income.PPP.th<-a.out$imputations$imp2$income.PPP/1000
a.out$imputations$imp3$income.PPP.th<-a.out$imputations$imp3$income.PPP/1000
a.out$imputations$imp4$income.PPP.th<-a.out$imputations$imp4$income.PPP/1000
a.out$imputations$imp5$income.PPP.th<-a.out$imputations$imp5$income.PPP/1000
summary(a.out$imputations$imp1$income.PPP.th)
summary(a.out$imputations$imp2$income.PPP.th)
summary(a.out$imputations$imp3$income.PPP.th)
summary(a.out$imputations$imp4$income.PPP.th)
summary(a.out$imputations$imp5$income.PPP.th)

#compare
summary(data_list$income.PPP.th)
##########################


typeof(a.out$imputations$imp1$country)
#Generate income distance by country by year
#Data frame 1
a.out$imputations$imp1$cntry_num<-as.numeric(a.out$imputations$imp1$country)
for(i in 1:15){
  for(j in c(2002,2004,2006,2008,2010,2012,2014)) {
    a.out$imputations$imp1$income.dist.th[a.out$imputations$imp1$cntry_num==i&
                                            a.out$imputations$imp1$year==j]<-
      a.out$imputations$imp1$income.PPP.th[a.out$imputations$imp1$cntry_num==i&
                                             a.out$imputations$imp1$year==j]-
      mean(a.out$imputations$imp1$income.PPP.th[a.out$imputations$imp1$cntry_num==i&
                                                  a.out$imputations$imp1$year==j],
           na.rm=T)
  }
}

#check
summary(a.out$imputations$imp1$income.dist.th[a.out$imputations$imp1$country=="Switzerland"&
                                                a.out$imputations$imp1$year==2008])
##########################


#Data frame 2
a.out$imputations$imp2$cntry_num<-as.numeric(a.out$imputations$imp2$country)
for(i in 1:15){
  for(j in c(2002,2004,2006,2008,2010,2012,2014)) {
    a.out$imputations$imp2$income.dist.th[a.out$imputations$imp2$cntry_num==i&
                                            a.out$imputations$imp2$year==j]<-
      a.out$imputations$imp2$income.PPP.th[a.out$imputations$imp2$cntry_num==i&
                                             a.out$imputations$imp2$year==j]-
      mean(a.out$imputations$imp2$income.PPP.th[a.out$imputations$imp2$cntry_num==i&
                                                  a.out$imputations$imp2$year==j],
           na.rm=T)
  }
}

summary(a.out$imputations$imp2$income.dist.th[a.out$imputations$imp2$country=="Switzerland"&
                                                a.out$imputations$imp2$year==2008])
##########################


#Data frame 3
a.out$imputations$imp3$cntry_num<-as.numeric(a.out$imputations$imp3$country)
for(i in 1:15){
  for(j in c(2002,2004,2006,2008,2010,2012,2014)) {
    a.out$imputations$imp3$income.dist.th[a.out$imputations$imp3$cntry_num==i&
                                            a.out$imputations$imp3$year==j]<-
      a.out$imputations$imp3$income.PPP.th[a.out$imputations$imp3$cntry_num==i&
                                             a.out$imputations$imp3$year==j]-
      mean(a.out$imputations$imp3$income.PPP.th[a.out$imputations$imp3$cntry_num==i&
                                                  a.out$imputations$imp3$year==j],
           na.rm=T)
  }
}

summary(a.out$imputations$imp3$income.dist.th[a.out$imputations$imp3$country=="Switzerland"&
                                                a.out$imputations$imp3$year==2008])
##########################


#Data frame 4
a.out$imputations$imp4$cntry_num<-as.numeric(a.out$imputations$imp4$country)
for(i in 1:15){
  for(j in c(2002,2004,2006,2008,2010,2012,2014)) {
    a.out$imputations$imp4$income.dist.th[a.out$imputations$imp4$cntry_num==i&
                                            a.out$imputations$imp4$year==j]<-
      a.out$imputations$imp4$income.PPP.th[a.out$imputations$imp4$cntry_num==i&
                                             a.out$imputations$imp4$year==j]-
      mean(a.out$imputations$imp4$income.PPP.th[a.out$imputations$imp4$cntry_num==i&
                                                  a.out$imputations$imp4$year==j],
           na.rm=T)
  }
}

summary(a.out$imputations$imp4$income.dist.th[a.out$imputations$imp4$country=="Switzerland"&
                                                a.out$imputations$imp4$year==2008])
##########################


#Data frame 5
a.out$imputations$imp5$cntry_num<-as.numeric(a.out$imputations$imp5$country)
for(i in 1:15){
  for(j in c(2002,2004,2006,2008,2010,2012,2014)) {
    a.out$imputations$imp5$income.dist.th[a.out$imputations$imp5$cntry_num==i&
                                            a.out$imputations$imp5$year==j]<-
      a.out$imputations$imp5$income.PPP.th[a.out$imputations$imp5$cntry_num==i&
                                             a.out$imputations$imp5$year==j]-
      mean(a.out$imputations$imp5$income.PPP.th[a.out$imputations$imp5$cntry_num==i&
                                                  a.out$imputations$imp5$year==j],
           na.rm=T)
  }
}

summary(a.out$imputations$imp5$income.dist.th[a.out$imputations$imp5$country=="Switzerland"&
                                                a.out$imputations$imp5$year==2008])
##########################

mi.out<-to_zelig_mi(a.out$imputations$imp1,a.out$imputations$imp2,a.out$imputations$imp3,
                    a.out$imputations$imp4,a.out$imputations$imp5)


##########################
#Save data
##########################

save(data_list,file="DataFile_10_listwise.Rda")
save(mi.out,file="DataFile_11_imputed.Rda")
##########################
